import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
conn = sqlite3.connect("olist.sqlite")
cur= conn.cursor()
query = "SELECT name FROM sqlite_master WHERE type='table'"
cur.execute(query)
cur.fetchall()
[('product_category_name_translation',),
('sellers',),
('customers',),
('geolocation',),
('order_items',),
('order_payments',),
('order_reviews',),
('orders',),
('products',),
('leads_qualified',),
('leads_closed',)]
query = "PRAGMA table_info(order_items)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | order_id | TEXT | 0 | None | 0 |
| 1 | 1 | order_item_id | INTEGER | 0 | None | 0 |
| 2 | 2 | product_id | TEXT | 0 | None | 0 |
| 3 | 3 | seller_id | TEXT | 0 | None | 0 |
| 4 | 4 | shipping_limit_date | TEXT | 0 | None | 0 |
| 5 | 5 | price | REAL | 0 | None | 0 |
| 6 | 6 | freight_value | REAL | 0 | None | 0 |
query = "SELECT COUNT(*) FROM order_items;"
pd.read_sql(query, conn)
| COUNT(*) | |
|---|---|
| 0 | 112650 |
query = "SELECT * FROM order_items LIMIT 10;"
pd.read_sql(query, conn)
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
| 5 | 00048cc3ae777c65dbb7d2a0634bc1ea | 1 | ef92defde845ab8450f9d70c526ef70f | 6426d21aca402a131fc0a5d0960a3c90 | 2017-05-23 03:55:27 | 21.90 | 12.69 |
| 6 | 00054e8431b9d7675808bcb819fb4a32 | 1 | 8d4f2bb7e93e6710a28f34fa83ee7d28 | 7040e82f899a04d1b434b795a43b4617 | 2017-12-14 12:10:31 | 19.90 | 11.85 |
| 7 | 000576fe39319847cbb9d288c5617fa6 | 1 | 557d850972a7d6f792fd18ae1400d9b6 | 5996cddab893a4652a15592fb58ab8db | 2018-07-10 12:30:45 | 810.00 | 70.75 |
| 8 | 0005a1a1728c9d785b8e2b08b904576c | 1 | 310ae3c140ff94b03219ad0adc3c778f | a416b6a846a11724393025641d4edd5e | 2018-03-26 18:31:29 | 145.95 | 11.65 |
| 9 | 0005f50442cb953dcd1d21e1fb923495 | 1 | 4535b0e1091c278dfd193e5a1d63b39f | ba143b05f0110f0dc71ad71b4466ce92 | 2018-07-06 14:10:56 | 53.99 | 11.40 |
query = "PRAGMA table_info(product_category_name_translation)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | product_category_name | TEXT | 0 | None | 0 |
| 1 | 1 | product_category_name_english | TEXT | 0 | None | 0 |
query = "SELECT * FROM product_category_name_translation LIMIT 5;"
pd.read_sql(query, conn)
| product_category_name | product_category_name_english | |
|---|---|---|
| 0 | beleza_saude | health_beauty |
| 1 | informatica_acessorios | computers_accessories |
| 2 | automotivo | auto |
| 3 | cama_mesa_banho | bed_bath_table |
| 4 | moveis_decoracao | furniture_decor |
query = "PRAGMA table_info(sellers)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | seller_id | TEXT | 0 | None | 0 |
| 1 | 1 | seller_zip_code_prefix | INTEGER | 0 | None | 0 |
| 2 | 2 | seller_city | TEXT | 0 | None | 0 |
| 3 | 3 | seller_state | TEXT | 0 | None | 0 |
query = "SELECT * FROM sellers LIMIT 5;"
pd.read_sql(query, conn)
| seller_id | seller_zip_code_prefix | seller_city | seller_state | |
|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
| 2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
| 3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP |
| 4 | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | braganca paulista | SP |
query = "PRAGMA table_info(customers)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | customer_id | TEXT | 0 | None | 0 |
| 1 | 1 | customer_unique_id | TEXT | 0 | None | 0 |
| 2 | 2 | customer_zip_code_prefix | INTEGER | 0 | None | 0 |
| 3 | 3 | customer_city | TEXT | 0 | None | 0 |
| 4 | 4 | customer_state | TEXT | 0 | None | 0 |
query = "SELECT * FROM customers LIMIT 5;"
pd.read_sql(query, conn)
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | campinas | SP |
query = "PRAGMA table_info(geolocation)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | geolocation_zip_code_prefix | INTEGER | 0 | None | 0 |
| 1 | 1 | geolocation_lat | REAL | 0 | None | 0 |
| 2 | 2 | geolocation_lng | REAL | 0 | None | 0 |
| 3 | 3 | geolocation_city | TEXT | 0 | None | 0 |
| 4 | 4 | geolocation_state | TEXT | 0 | None | 0 |
query = "SELECT * FROM geolocation LIMIT 5;"
pd.read_sql(query, conn)
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|
| 0 | 1037 | -23.545621 | -46.639292 | sao paulo | SP |
| 1 | 1046 | -23.546081 | -46.644820 | sao paulo | SP |
| 2 | 1046 | -23.546129 | -46.642951 | sao paulo | SP |
| 3 | 1041 | -23.544392 | -46.639499 | sao paulo | SP |
| 4 | 1035 | -23.541578 | -46.641607 | sao paulo | SP |
query = "PRAGMA table_info(order_payments)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | order_id | TEXT | 0 | None | 0 |
| 1 | 1 | payment_sequential | INTEGER | 0 | None | 0 |
| 2 | 2 | payment_type | TEXT | 0 | None | 0 |
| 3 | 3 | payment_installments | INTEGER | 0 | None | 0 |
| 4 | 4 | payment_value | REAL | 0 | None | 0 |
query = "SELECT * FROM order_payments LIMIT 10;"
pd.read_sql(query, conn)
| order_id | payment_sequential | payment_type | payment_installments | payment_value | |
|---|---|---|---|---|---|
| 0 | b81ef226f3fe1789b1e8b2acac839d17 | 1 | credit_card | 8 | 99.33 |
| 1 | a9810da82917af2d9aefd1278f1dcfa0 | 1 | credit_card | 1 | 24.39 |
| 2 | 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | credit_card | 1 | 65.71 |
| 3 | ba78997921bbcdc1373bb41e913ab953 | 1 | credit_card | 8 | 107.78 |
| 4 | 42fdf880ba16b47b59251dd489d4441a | 1 | credit_card | 2 | 128.45 |
| 5 | 298fcdf1f73eb413e4d26d01b25bc1cd | 1 | credit_card | 2 | 96.12 |
| 6 | 771ee386b001f06208a7419e4fc1bbd7 | 1 | credit_card | 1 | 81.16 |
| 7 | 3d7239c394a212faae122962df514ac7 | 1 | credit_card | 3 | 51.84 |
| 8 | 1f78449c87a54faf9e96e88ba1491fa9 | 1 | credit_card | 6 | 341.09 |
| 9 | 0573b5e23cbd798006520e1d5b4c6714 | 1 | boleto | 1 | 51.95 |
query = "PRAGMA table_info(order_reviews)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | review_id | TEXT | 0 | None | 0 |
| 1 | 1 | order_id | TEXT | 0 | None | 0 |
| 2 | 2 | review_score | INTEGER | 0 | None | 0 |
| 3 | 3 | review_comment_title | TEXT | 0 | None | 0 |
| 4 | 4 | review_comment_message | TEXT | 0 | None | 0 |
| 5 | 5 | review_creation_date | TEXT | 0 | None | 0 |
| 6 | 6 | review_answer_timestamp | TEXT | 0 | None | 0 |
query = "SELECT * FROM order_reviews LIMIT 5;"
pd.read_sql(query, conn)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | None | None | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | None | None | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | None | None | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | None | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | None | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 |
query = "PRAGMA table_info(orders)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | order_id | TEXT | 0 | None | 0 |
| 1 | 1 | customer_id | TEXT | 0 | None | 0 |
| 2 | 2 | order_status | TEXT | 0 | None | 0 |
| 3 | 3 | order_purchase_timestamp | TEXT | 0 | None | 0 |
| 4 | 4 | order_approved_at | TEXT | 0 | None | 0 |
| 5 | 5 | order_delivered_carrier_date | TEXT | 0 | None | 0 |
| 6 | 6 | order_delivered_customer_date | TEXT | 0 | None | 0 |
| 7 | 7 | order_estimated_delivery_date | TEXT | 0 | None | 0 |
query = "SELECT * FROM orders LIMIT 5;"
pd.read_sql(query, conn)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
query = "PRAGMA table_info(products)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | product_id | TEXT | 0 | None | 0 |
| 1 | 1 | product_category_name | TEXT | 0 | None | 0 |
| 2 | 2 | product_name_lenght | REAL | 0 | None | 0 |
| 3 | 3 | product_description_lenght | REAL | 0 | None | 0 |
| 4 | 4 | product_photos_qty | REAL | 0 | None | 0 |
| 5 | 5 | product_weight_g | REAL | 0 | None | 0 |
| 6 | 6 | product_length_cm | REAL | 0 | None | 0 |
| 7 | 7 | product_height_cm | REAL | 0 | None | 0 |
| 8 | 8 | product_width_cm | REAL | 0 | None | 0 |
query = "SELECT * FROM products LIMIT 5;"
pd.read_sql(query, conn)
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40.0 | 287.0 | 1.0 | 225.0 | 16.0 | 10.0 | 14.0 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44.0 | 276.0 | 1.0 | 1000.0 | 30.0 | 18.0 | 20.0 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46.0 | 250.0 | 1.0 | 154.0 | 18.0 | 9.0 | 15.0 |
| 3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27.0 | 261.0 | 1.0 | 371.0 | 26.0 | 4.0 | 26.0 |
| 4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37.0 | 402.0 | 4.0 | 625.0 | 20.0 | 17.0 | 13.0 |
query = "PRAGMA table_info(leads_qualified)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | mql_id | TEXT | 0 | None | 0 |
| 1 | 1 | first_contact_date | TEXT | 0 | None | 0 |
| 2 | 2 | landing_page_id | TEXT | 0 | None | 0 |
| 3 | 3 | origin | TEXT | 0 | None | 0 |
query = "SELECT * FROM leads_qualified LIMIT 5;"
pd.read_sql(query, conn)
| mql_id | first_contact_date | landing_page_id | origin | |
|---|---|---|---|---|
| 0 | dac32acd4db4c29c230538b72f8dd87d | 2018-02-01 | 88740e65d5d6b056e0cda098e1ea6313 | social |
| 1 | 8c18d1de7f67e60dbd64e3c07d7e9d5d | 2017-10-20 | 007f9098284a86ee80ddeb25d53e0af8 | paid_search |
| 2 | b4bc852d233dfefc5131f593b538befa | 2018-03-22 | a7982125ff7aa3b2054c6e44f9d28522 | organic_search |
| 3 | 6be030b81c75970747525b843c1ef4f8 | 2018-01-22 | d45d558f0daeecf3cccdffe3c59684aa | |
| 4 | 5420aad7fec3549a85876ba1c529bd84 | 2018-02-21 | b48ec5f3b04e9068441002a19df93c6c | organic_search |
query = "PRAGMA table_info(leads_closed)"
pd.read_sql(query, conn)
| cid | name | type | notnull | dflt_value | pk | |
|---|---|---|---|---|---|---|
| 0 | 0 | mql_id | TEXT | 0 | None | 0 |
| 1 | 1 | seller_id | TEXT | 0 | None | 0 |
| 2 | 2 | sdr_id | TEXT | 0 | None | 0 |
| 3 | 3 | sr_id | TEXT | 0 | None | 0 |
| 4 | 4 | won_date | TEXT | 0 | None | 0 |
| 5 | 5 | business_segment | TEXT | 0 | None | 0 |
| 6 | 6 | lead_type | TEXT | 0 | None | 0 |
| 7 | 7 | lead_behaviour_profile | TEXT | 0 | None | 0 |
| 8 | 8 | has_company | INTEGER | 0 | None | 0 |
| 9 | 9 | has_gtin | INTEGER | 0 | None | 0 |
| 10 | 10 | average_stock | TEXT | 0 | None | 0 |
| 11 | 11 | business_type | TEXT | 0 | None | 0 |
| 12 | 12 | declared_product_catalog_size | REAL | 0 | None | 0 |
| 13 | 13 | declared_monthly_revenue | REAL | 0 | None | 0 |
query = "SELECT * FROM leads_closed LIMIT 10;"
pd.read_sql(query, conn)
| mql_id | seller_id | sdr_id | sr_id | won_date | business_segment | lead_type | lead_behaviour_profile | has_company | has_gtin | average_stock | business_type | declared_product_catalog_size | declared_monthly_revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5420aad7fec3549a85876ba1c529bd84 | 2c43fb513632d29b3b58df74816f1b06 | a8387c01a09e99ce014107505b92388c | 4ef15afb4b2723d8f3d81e51ec7afefe | 2018-02-26 19:58:54 | pet | online_medium | cat | None | None | None | reseller | None | 0.0 |
| 1 | a555fb36b9368110ede0f043dfc3b9a0 | bbb7d7893a450660432ea6652310ebb7 | 09285259593c61296eef10c734121d5b | d3d1e91a157ea7f90548eef82f1955e3 | 2018-05-08 20:17:59 | car_accessories | industry | eagle | None | None | None | reseller | None | 0.0 |
| 2 | 327174d3648a2d047e8940d7d15204ca | 612170e34b97004b3ba37eae81836b4c | b90f87164b5f8c2cfa5c8572834dbe3f | 6565aa9ce3178a5caf6171827af3a9ba | 2018-06-05 17:27:23 | home_appliances | online_big | cat | None | None | None | reseller | None | 0.0 |
| 3 | f5fee8f7da74f4887f5bcae2bafb6dd6 | 21e1781e36faf92725dde4730a88ca0f | 56bf83c4bb35763a51c2baab501b4c67 | d3d1e91a157ea7f90548eef82f1955e3 | 2018-01-17 13:51:03 | food_drink | online_small | None | None | None | None | reseller | None | 0.0 |
| 4 | ffe640179b554e295c167a2f6be528e0 | ed8cb7b190ceb6067227478e48cf8dde | 4b339f9567d060bcea4f5136b9f5949e | d3d1e91a157ea7f90548eef82f1955e3 | 2018-07-03 20:17:45 | home_appliances | industry | wolf | None | None | None | manufacturer | None | 0.0 |
| 5 | b94fba7670eeb44dce2a0d8eb790e9f5 | 1c742ac33582852aaf3bcfbf5893abcf | fdb16d3cbbeb5798f2f66c4096be026d | 495d4e95a8cf8bbf8b432b612a2aa328 | 2018-02-07 18:04:05 | health_beauty | online_medium | None | None | None | None | manufacturer | None | 0.0 |
| 6 | c3e30ed7ac989117c7e1e719b4ac128f | 92d7568ad0c5c76fd7d341b2d46f24d6 | 4b339f9567d060bcea4f5136b9f5949e | 85fc447d336637ba1df43e793199fbc8 | 2018-04-16 18:18:22 | computers | online_medium | wolf | None | None | None | reseller | None | 0.0 |
| 7 | b02c89251106e1fdd9d92744be9f94f2 | 44ed138eca6214d572ce1d813fb0049b | 34d40cdaf94010a1d05b0d6212f9e909 | 4ef15afb4b2723d8f3d81e51ec7afefe | 2018-04-17 17:01:57 | health_beauty | offline | cat | None | None | None | manufacturer | None | 0.0 |
| 8 | a90a37898cc5f2718385a2fb981caaff | 0b28859cd04d23edefee9c591fb03cd8 | f42a2bd194f7802ab052a815c8de65b7 | 6565aa9ce3178a5caf6171827af3a9ba | 2018-05-14 18:37:15 | household_utilities | offline | eagle | None | None | None | reseller | None | 0.0 |
| 9 | 0173e8d8b1d94a355b440fb67388f532 | 87d73636a3acf123e842bb890a4db036 | 9d12ef1a7eca3ec58c545c678af7869c | 9e4d1098a3b0f5da39b0bc48f9876645 | 2018-04-24 03:00:00 | household_utilities | online_medium | eagle | None | None | None | manufacturer | None | 0.0 |
Products with highest revenues¶
- The query below helps to identify the top 10 products generating the highest revenue.
- As a result we understand that product with product_id = 'bb50f2e236e5eea0100680137654686c' generated the highest revenue (63,885.00) from 195 sales.
This product is likely one of the highest-priced or most frequently purchased items.
query = """
SELECT product_id, SUM(price), COUNT(product_id) FROM order_items
GROUP BY product_id
ORDER BY SUM(price) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('bb50f2e236e5eea0100680137654686c', 63885.0, 195),
('6cdd53843498f92890544667809f1595', 54730.200000000106, 156),
('d6160fb7873f184099d9bc95e30376af', 48899.34, 35),
('d1c427060a0f73f6b889a5c7c61f2ac4', 47214.50999999998, 343),
('99a4788cb24856965c36a24e339b6058', 43025.56000000037, 488),
('3dd2a17168ec895c781a9191c1e95ad7', 41082.60000000021, 274),
('25c38557cf793876c5abdd5931f922db', 38907.32000000001, 38),
('5f504b3a1c75b73d6151be81eb05bdc9', 37733.90000000001, 63),
('53b36df67ebb7c41585e8d54d6772e08', 37683.42000000013, 323),
('aca2eb7d00ea1a7b8ebd4e68314663af', 37608.900000000314, 527)]
10 most frequently ordered products, regardless of revenue.¶
- The query below highlights the top 10 most frequently ordered products, regardless of revenue.
- The product with product_id = 'aca2eb7d00ea1a7b8ebd4e68314663af' has the highest order count (527), generating 37,608.00 in revenue.
This product might be a low-cost or essential item that sells in large volumes.
query = """
SELECT product_id, SUM(price), COUNT(product_id) FROM order_items
GROUP BY product_id
ORDER BY COUNT(product_id) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('aca2eb7d00ea1a7b8ebd4e68314663af', 37608.900000000314, 527),
('99a4788cb24856965c36a24e339b6058', 43025.56000000037, 488),
('422879e10f46682990de24d770e7f83d', 26577.22000000016, 484),
('389d119b48cf3043d311335e499d9c6b', 21440.59000000004, 392),
('368c6c730842d78016ad823897a372db', 21056.80000000004, 388),
('53759a2ecddad2bb87a079a1f1519f73', 20387.200000000033, 373),
('d1c427060a0f73f6b889a5c7c61f2ac4', 47214.50999999998, 343),
('53b36df67ebb7c41585e8d54d6772e08', 37683.42000000013, 323),
('154e7e31ebfa092203795c972e5804a6', 6325.18999999996, 281),
('3dd2a17168ec895c781a9191c1e95ad7', 41082.60000000021, 274)]
- We can conclude that Products with high revenue don't necessarily have the highest order count, and vice versa.
- For example:
Product 'bb50f2e236e5eea0100680137654686c' tops revenue but is only ordered 195 times. Product 'aca2eb7d00ea1a7b8ebd4e68314663af' has the most orders but ranks lower in revenue. This indicates that expensive products contribute more to revenue even with fewer sales, while cheaper products dominate order counts.
Recommendations: For high-revenue products focus marketing efforts on premium buyers and ensure stock availability. For high-order-count products explore upselling or bundling strategies to increase revenue.
Product Categories with highest Revenue¶
- The query below calculates the total revenue total orders for each product category.Categories are ranked in descending order of revenue, returning the top 10.
- We undesrtand that the top category is 'beleza_saude' (health and beauty), generating a total revenue of 1,258,681.34 from 9,670 orders. This suggests that health and beauty products are highly profitable.
'relogios_presentes' (watches and gifts) ranks second with 1,205,005.68 in revenue from 5,991 orders, indicating a smaller but higher-priced product set. 'cama_mesa_banho' (bed, bath, and table) has the most orders (11,115) among the top 3 categories, but it generates slightly less revenue (1,036,988.68) than the other two. This implies these products might be lower in price but sell in higher quantities.
query = """
SELECT products.product_category_name, SUM(order_items.price), COUNT(*)
FROM order_items
JOIN products ON products.product_id = order_items.product_id
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('beleza_saude', 1258681.3399999682, 9670),
('relogios_presentes', 1205005.6799999995, 5991),
('cama_mesa_banho', 1036988.6800000712, 11115),
('esporte_lazer', 988048.9700000401, 8641),
('informatica_acessorios', 911954.3200000388, 7827),
('moveis_decoracao', 729762.4900000411, 8334),
('cool_stuff', 635290.8500000009, 3796),
('utilidades_domesticas', 632248.6600000213, 6964),
('automotivo', 592720.1100000107, 4235),
('ferramentas_jardim', 485256.46000001475, 4347)]
Product Categories with highest Revenue in English¶
- The query below a similar query to query above but includes a JOIN with the product_category_name_translation table to get English translations of category names.
- The addition of English names makes the results more client-friendly for non-Portuguese speakers.
query = """
SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*)
FROM order_items
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('health_beauty', 1258681.34, 9670),
('watches_gifts', 1205005.68, 5991),
('bed_bath_table', 1036988.68, 11115),
('sports_leisure', 988048.97, 8641),
('computers_accessories', 911954.32, 7827),
('furniture_decor', 729762.49, 8334),
('cool_stuff', 635290.85, 3796),
('housewares', 632248.66, 6964),
('auto', 592720.11, 4235),
('garden_tools', 485256.46, 4347)]
# Plotly Sunburst Visualization
revenue_data = pd.DataFrame({
'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table',
'sports_leisure', 'computers_accessories', 'furniture_decor',
'cool_stuff', 'housewares', 'auto', 'garden_tools'],
'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32,
729762.49, 635290.85, 632248.66, 592720.11, 485256.46],
'order_count': [9670, 5991, 11115, 8641, 7827, 8334, 3796, 6964, 4235, 4347]
})
hover_data={'revenue': True, 'order_count': True}
fig = px.sunburst(revenue_data,
path=['product_category'],
values='revenue',
color='order_count',
color_continuous_scale='Viridis',
title='Top 10 Product Categories: Revenue & Orders',
hover_data={'revenue': True, 'order_count': True})
fig.update_traces(textinfo='label+percent parent') # Show percentage relative to the parent category
fig.update_layout(title_font_size=20)
fig.show()
Explanation is below with donut chart's explanation.
#Donut Chart
fig = px.pie(revenue_data,
names='product_category',
values='revenue',
color='product_category',
title='Top 10 Product Categories: Revenue Distribution')
fig.update_traces(hole=0.4, textinfo='percent+label')
fig.update_layout(title_font_size=20, showlegend=True)
fig.show()
The donut chart illustrates the distribution of revenue among the top 10 product categories in the dataset. Each segment represents a product category, with its size corresponding to the total revenue generated by that category. The percentage labels inside the chart indicate each category's share of the total revenue.
From the chart, we observe the following key insights:
Health & Beauty accounts for the largest share of revenue, contributing approximately 14.9% of the total revenue among the top 10 categories. This is followed by Watches & Gifts and Bed, Bath & Table, which contribute 14.2% and 12.2%, respectively. Other categories such as Sports & Leisure and Computers & Accessories also make significant contributions to overall revenue. This visualization highlights which product categories are the most lucrative, providing valuable insights for potential business decisions, such as inventory management, marketing strategies, or promotional activities.
The pie chart also shows the same things, but it's visually harder to understand than donut chart, because here texts are more visible and overall visualisation is easier to understand.
# Plotly Packed Bubble Chart
fig = px.scatter(revenue_data,
x='product_category',
y='revenue',
size='revenue',
color='product_category',
hover_name='product_category',
size_max=60,
title='Top 10 Product Categories by Revenue (Bubble Chart)')
fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(title_font_size=20, xaxis_title=None, yaxis_title="Revenue")
fig.update_layout(
height=600)
fig.show()
In case of buuble chart, we can compare the revenues of each category based on the size of the bubble, the bigger the bubble, the bigger the revene. And as the visualizations are for the same dataset ( I was experimenting to find out which is more beautiful/understandable), we can see the same results as of which category has more revenue.
# Animated bubble chart
import numpy as np
revenue_data = pd.DataFrame({
'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table', 'sports_leisure', 'computers_accessories',
'furniture_decor', 'cool_stuff', 'housewares', 'auto', 'garden_tools'],
'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32, 729762.49, 635290.85, 632248.66, 592720.11, 485256.46]
})
years = ['2020', '2021', '2022', '2023']
revenue_data_animated = pd.concat([revenue_data.assign(year=year) for year in years], ignore_index=True)
np.random.seed(42)
revenue_data_animated['revenue'] *= (1 + 0.1 * np.random.randn(len(revenue_data_animated)))
fig = px.scatter(
revenue_data_animated,
x='product_category',
y='revenue',
size='revenue',
color='product_category',
animation_frame='year',
hover_name='product_category',
size_max=60,
title='Top 10 Product Categories by Revenue Over Time (Bubble Chart)'
)
fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(
title_font_size=20,
xaxis_title=None,
yaxis_title="Revenue",
xaxis={'categoryorder': 'total ascending'}
)
fig.update_layout(
height=600)
fig.show()
This one is the animated version of the bubble chart below. Here the animation frame based on year is added.
# Plotly Funnel Chart
fig = px.funnel(revenue_data,
x='revenue',
y='product_category',
title='Top 10 Product Categories by Revenue (Funnel Chart)',
color='product_category')
fig.update_layout(title_font_size=20, xaxis_title="Revenue", yaxis_title=None)
fig.update_layout(
height=600)
fig.show()
The funnel chart above is a good visualization with the help of which the viewer can easily see the difference of revenues based on product categories. The higher the revenue of the product pategory is, the bigger is its bar and in lower posistion. In my opinion, it is the best visualization in term of easy understanding. It's just simple looking compared of some of the others I had above.
# The radial chart
revenue_data = pd.DataFrame({
'product_category': ['health_beauty', 'watches_gifts', 'bed_bath_table', 'sports_leisure', 'computers_accessories',
'furniture_decor', 'cool_stuff', 'housewares', 'auto', 'garden_tools'],
'revenue': [1258681.34, 1205005.68, 1036988.68, 988048.97, 911954.32, 729762.49, 635290.85, 632248.66, 592720.11, 485256.46]
})
revenue_data_closed = revenue_data.copy()
revenue_data_closed = pd.concat([revenue_data, revenue_data.iloc[0:1]], ignore_index=True)
category_order = list(revenue_data['product_category']) + [revenue_data['product_category'][0]]
fig = px.line_polar(
revenue_data_closed,
r='revenue',
theta='product_category',
line_close=False,
title='Revenue by Product Categories (Radial Chart)',
markers=True,
color_discrete_sequence=px.colors.sequential.Sunset
)
fig.add_scatterpolar(
r=[revenue_data_closed['revenue'].iloc[0]],
theta=[revenue_data_closed['product_category'].iloc[0]],
mode='lines+markers',
line=dict(color=px.colors.sequential.Sunset[0])
)
fig.update_traces(marker=dict(size=8))
fig.update_layout(title_font_size=20, polar=dict(angularaxis=dict(categoryorder='array', categoryarray=category_order)))
fig.show()
The radial chart is also visual representation of revenues based on productcategories, the further is the category from center the bigger is the ammount of its revenue.
Product Categories camparison by number of orders in English¶
- This query calculates the same metrics as query above but ranks categories by the number of orders, not revenue.
Key Insights:
- bed_bath_table ranks first in order count (11,115 orders), despite being third in revenue. This indicates that it’s a high-demand, low-cost category.
- health_beauty drops to second in order count, with 9,670 orders, but still leads in revenue due to higher prices.
- sports_leisure ranks third with 8,641 orders, making it a popular category.
query = """
SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*)
FROM order_items
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY COUNT(*) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('bed_bath_table', 1036988.68, 11115),
('health_beauty', 1258681.34, 9670),
('sports_leisure', 988048.97, 8641),
('furniture_decor', 729762.49, 8334),
('computers_accessories', 911954.32, 7827),
('housewares', 632248.66, 6964),
('watches_gifts', 1205005.68, 5991),
('telephony', 323667.53, 4545),
('garden_tools', 485256.46, 4347),
('auto', 592720.11, 4235)]
# Horizontal Bar Chart - Matplotlib
categories = ['bed_bath_table', 'health_beauty', 'sports_leisure', 'furniture_decor',
'computers_accessories', 'housewares', 'watches_gifts', 'telephony',
'garden_tools', 'auto']
orders = [11115, 9670, 8641, 8334, 7827, 6964, 5991, 4545, 4347, 4235]
plt.figure(figsize=(10, 6))
plt.barh(categories, orders, color='skyblue')
plt.xlabel('Number of Orders')
plt.ylabel('Product Categories')
plt.title('Product Categories by Number of Orders')
plt.gca().invert_yaxis()
plt.show()
The bar chaet below helps us to see product categories based on the number of orders. As we can see the category "bed_bath_table" has the highest number of orders, while "auto" has the lowest.
import pandas as pd
import plotly.express as px
# Assuming you have already executed the SQL query and fetched the results
results = [('bed_bath_table', 1036988.68, 11115),
('health_beauty', 1258681.34, 9670),
('sports_leisure', 988048.97, 8641),
('furniture_decor', 729762.49, 8334),
('computers_accessories', 911954.32, 7827),
('housewares', 632248.66, 6964),
('watches_gifts', 1205005.68, 5991),
('telephony', 323667.53, 4545),
('garden_tools', 485256.46, 4347),
('auto', 592720.11, 4235)]
# Create the DataFrame from the results
df = pd.DataFrame(results, columns=['Product Category', 'Revenue', 'Orders'])
# Create the bubble chart
fig = px.scatter(df,
x='Orders',
y='Revenue',
size='Orders',
color='Product Category',
title='Bubble Chart: Product Categories by Orders and Revenue',
labels={'Orders': 'Number of Orders', 'Revenue': 'Total Revenue'})
fig.update_layout(
height=600)
fig.show()
The bubble chart below again shows product categories by orders and revenue. When looking at horizontal axis we can see the numbe rof orders, and by looking at vertical axis we can see the number of total revenue. And beside visually seeing each category's corresponding bubble in the visual in corresponding place based on numbers, we can click on each of bubbles to see exact numbers of that categorts revenue and orders.
Low-performing categories¶
query = """
SELECT product_category_name_translation.product_category_name_english, ROUND(SUM(order_items.price),2), COUNT(*)
FROM order_items
JOIN products ON products.product_id = order_items.product_id
JOIN product_category_name_translation ON product_category_name_translation.product_category_name = products.product_category_name
GROUP BY products.product_category_name
ORDER BY SUM(order_items.price) ASC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('security_and_services', 283.29, 2),
('fashion_childrens_clothes', 569.85, 8),
('cds_dvds_musicals', 730.0, 14),
('home_comfort_2', 760.27, 30),
('flowers', 1110.04, 33),
('diapers_and_hygiene', 1567.59, 39),
('arts_and_craftmanship', 1814.01, 24),
('la_cuisine', 2054.99, 14),
('fashion_sport', 2119.51, 30),
('fashio_female_clothing', 2803.64, 48)]
- Recommentations:
For high performing categories focus marketing and inventory management efforts on health_beauty, watches_gifts, and bed_bath_table as they are the top revenue drivers. For bed_bath_table, explore opportunities to increase per-order value through bundles or upselling since it has high volume but relatively low revenue.
For low-performing categories reassess the viability of maintaining categories like security_and_services and cds_dvds_musicals, which contribute minimal revenue and may not justify storage or marketing costs. Explore strategies like discounts or clearance sales to liquidate inventory in these categories.
Insights for Category Expansion: Consider expanding high-margin categories like health_beauty and watches_gifts by introducing new products or targeting similar customer segments.
#Bar chart
low_performing_categories = [
('telephony', 323667.53, 4545),
('garden_tools', 485256.46, 4347),
('auto', 592720.11, 4235),
('watches_gifts', 1205005.68, 5991),
('housewares', 632248.66, 6964),
('furniture_decor', 729762.49, 8334),
('computers_accessories', 911954.32, 7827),
('sports_leisure', 988048.97, 8641),
('health_beauty', 1258681.34, 9670),
('bed_bath_table', 1036988.68, 11115)
]
df_low_performing = pd.DataFrame(low_performing_categories, columns=['Product Category', 'Revenue', 'Orders'])
fig = px.bar(df_low_performing,
x='Product Category',
y='Revenue',
title='Low Performing Product Categories by Revenue',
labels={'Revenue': 'Total Revenue', 'Product Category': 'Product Category'},
color='Product Category')
fig.show()
Here again we have bar chart where we can visually understand the revenue of categores. This time the categories are 10 least performing ones, and the comparison is between them.
Sellers¶
- There are 3,095 sellers in the dataset.
query = "SELECT COUNT(*) FROM sellers;"
pd.read_sql(query, conn)
| COUNT(*) | |
|---|---|
| 0 | 3095 |
Top 10 Sellers by Number of Orders¶
The query below ranks sellers by the number of orders handled. It also provides the total revenue generated by each seller.
Key Insights:
The seller '6560211a19b47992c3666cc44a7e94c0' handled the highest number of orders (2,033) and generated 123,304.83 in revenue.
The second seller '4a3ca9315b744ce9f8e9374361493884' had slightly fewer orders (1,987) but earned significantly higher revenue (200,472.92), suggesting they sell higher-priced products.
The sellers listed handle high order volumes, with counts ranging from 1,171 to 2,033, making them major contributors to the platform's activity.
query = """
SELECT seller_id, SUM(price), COUNT(seller_id) FROM order_items
GROUP BY seller_id
ORDER BY COUNT(seller_id) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('6560211a19b47992c3666cc44a7e94c0', 123304.83000000003, 2033),
('4a3ca9315b744ce9f8e9374361493884', 200472.9199999949, 1987),
('1f50f920176fa81dab994f9023523100', 106939.20999999739, 1931),
('cc419e0650a3c5ba77189a1882b7556a', 104288.42000000263, 1775),
('da8622b14eb17ae2831f4ac5b9dab84a', 160236.56999999538, 1551),
('955fee9216a65b617aa5c0531780ce60', 135171.69999999914, 1499),
('1025f0e2d44d7041d6cf58b6550e0bfa', 138968.5499999995, 1428),
('7c67e1448b00f6e969d365cea6b010ab', 187923.8899999995, 1364),
('ea8482cd71df3c1969d7b9473ff13abc', 37177.52000000046, 1203),
('7a67c85e85bb2ce8582c35f2203ad736', 141745.53000000177, 1171)]
Top 10 Sellers by Total Revenue¶
The query below ranks sellers by their total revenue instead of order count. It includes the number of orders to give additional context.
Key Insights:
The top seller by revenue is '4869f7a5dfa277a7dca6462dcf3b52b2', generating 229,472.63 from 1,156 orders. Despite fewer orders compared to the highest order-volume seller, this seller generates significantly more revenue, indicating high-value sales.
The second seller '53243585a1d6dc2643021fd1853d8905' earned 222,776.05 from just 410 orders, showing an even higher average order value than the first.
Sellers with high revenue don't necessarily overlap with those handling the most orders, reinforcing that product pricing and value per order differ widely between sellers.
query = """
SELECT seller_id, SUM(price), COUNT(seller_id) FROM order_items
GROUP BY seller_id
ORDER BY SUM(price) DESC
LIMIT 10 """
cur.execute(query)
cur.fetchall()
[('4869f7a5dfa277a7dca6462dcf3b52b2', 229472.6299999981, 1156),
('53243585a1d6dc2643021fd1853d8905', 222776.04999999952, 410),
('4a3ca9315b744ce9f8e9374361493884', 200472.9199999949, 1987),
('fa1c13f2614d7b5c4749cbc52fecda94', 194042.02999999846, 586),
('7c67e1448b00f6e969d365cea6b010ab', 187923.8899999995, 1364),
('7e93a43ef30c4f03f38b393420bc753a', 176431.86999999982, 340),
('da8622b14eb17ae2831f4ac5b9dab84a', 160236.56999999538, 1551),
('7a67c85e85bb2ce8582c35f2203ad736', 141745.53000000177, 1171),
('1025f0e2d44d7041d6cf58b6550e0bfa', 138968.5499999995, 1428),
('955fee9216a65b617aa5c0531780ce60', 135171.69999999914, 1499)]
Order Status¶
query = "SELECT * FROM orders LIMIT 5;"
pd.read_sql(query, conn)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
query = """
SELECT order_status FROM orders
GROUP BY order_status
ORDER BY order_status
"""
cur.execute(query)
cur.fetchall()
[('approved',),
('canceled',),
('created',),
('delivered',),
('invoiced',),
('processing',),
('shipped',),
('unavailable',)]
Top Sellers by Revenue based on "delivered" items.¶
- Seller 4a3ca9315b744ce9f8e9374361493884 handled the most orders (1,949), but Seller 4869f7a5dfa277a7dca6462dcf3b52b2 generated the highest revenue ($226,987.93).
- There’s a large variation in the number of orders per seller, suggesting differences in business models or product pricing.
query = """
SELECT seller_id, SUM(price), COUNT(*) FROM order_items
JOIN orders ON orders.order_id = order_items.order_id
WHERE order_status = 'delivered'
GROUP BY seller_id
ORDER BY SUM(price) DESC
LIMIT 5
"""
cur.execute(query)
cur.fetchall()
[('4869f7a5dfa277a7dca6462dcf3b52b2', 226987.92999999813, 1148),
('53243585a1d6dc2643021fd1853d8905', 217940.43999999968, 400),
('4a3ca9315b744ce9f8e9374361493884', 196882.11999999516, 1949),
('fa1c13f2614d7b5c4749cbc52fecda94', 190917.13999999838, 579),
('7c67e1448b00f6e969d365cea6b010ab', 186570.04999999996, 1355)]
Order Status Analysis by Number¶
From the results below we see that the majority of orders (96,478) are delivered, indicating successful transactions.
A significant number of orders are canceled (625) or marked as unavailable (609), which might represent issues in fulfillment or customer cancellations.
Low number of "created" status indicates rapid process of orders and delivery.
query = """
SELECT order_status, COUNT(*) FROM orders
GROUP BY order_status
ORDER BY order_status
"""
cur.execute(query)
cur.fetchall()
[('approved', 2),
('canceled', 625),
('created', 5),
('delivered', 96478),
('invoiced', 314),
('processing', 301),
('shipped', 1107),
('unavailable', 609)]
Top Products by revenue for Delivered items¶
- Product bb50f2e236e5eea0100680137654686c generates the highest revenue but is not the most frequently sold (194 orders).
- From the 10 highest revenue generating delivered items, product 99a4788cb24856965c36a24e339b6058 has the highest sales volume (477 orders) but ranks fifth in revenue, suggesting lower pricing.
query = """
SELECT product_id, SUM(price), COUNT(*) FROM order_items
JOIN orders ON orders.order_id = order_items.order_id
WHERE order_status = 'delivered'
GROUP BY product_id
ORDER BY SUM(price) DESC
LIMIT 10
"""
cur.execute(query)
cur.fetchall()
[('bb50f2e236e5eea0100680137654686c', 63560.0, 194),
('6cdd53843498f92890544667809f1595', 53652.3000000001, 153),
('d6160fb7873f184099d9bc95e30376af', 45949.35, 33),
('d1c427060a0f73f6b889a5c7c61f2ac4', 45620.55999999998, 332),
('99a4788cb24856965c36a24e339b6058', 42049.66000000035, 477),
('3dd2a17168ec895c781a9191c1e95ad7', 40782.80000000021, 272),
('25c38557cf793876c5abdd5931f922db', 38907.32000000001, 38),
('5f504b3a1c75b73d6151be81eb05bdc9', 37733.90000000001, 63),
('53b36df67ebb7c41585e8d54d6772e08', 37454.630000000136, 321),
('aca2eb7d00ea1a7b8ebd4e68314663af', 37104.300000000294, 520)]
query = "SELECT * FROM order_reviews LIMIT 5;"
pd.read_sql(query, conn)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | None | None | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | None | None | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | None | None | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | None | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | None | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 |
Review Score Analysis¶
From the query below we conclude that the majority of orders received a high review score of 5 (57328), accounting for the largest percent of reviews (57.78%).
query = """
SELECT order_reviews.review_score, COUNT(*),
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM order_reviews), 2) AS percent
FROM orders
JOIN order_reviews on order_reviews.order_id = orders.order_id
GROUP BY review_score
ORDER BY review_score
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| review_score | COUNT(*) | percent | |
|---|---|---|---|
| 0 | 1 | 11424 | 11.51 |
| 1 | 2 | 3151 | 3.18 |
| 2 | 3 | 8179 | 8.24 |
| 3 | 4 | 19142 | 19.29 |
| 4 | 5 | 57328 | 57.78 |
# Interactive pie chart
data = {
'review_score': [1, 2, 3, 4, 5],
'count': [11424, 3151, 8179, 19142, 57328],
'percent': [11.51, 3.18, 8.24, 19.29, 57.78]
}
df = pd.DataFrame(data)
fig = px.pie(df,
names='review_score',
values='count',
hover_data={'percent': True},
title="Review Score Distribution")
fig.update_traces(textinfo='percent+label', pull=[0.1, 0, 0, 0, 0])
fig.show()
With the help of this pie chart we can see the percentage of teach review score visually.
# A line plot for percentage changes over time
fig = px.line(df,
x='review_score',
y='percent',
markers=True,
title="Review Score Percentages",
labels={'review_score': 'Review Score', 'percent': 'Percentage of Reviews'})
fig.show()
Here we can also see the percantage change in case of revies scores, but I think pie chart is visually better as we have only 5 options in this case.
# Interactive bar chart
data = {
'review_score': [1, 2, 3, 4, 5],
'count': [11424, 3151, 8179, 19142, 57328],
'percent': [11.51, 3.18, 8.24, 19.29, 57.78]
}
df = pd.DataFrame(data)
fig = px.bar(df,
x='review_score',
y='count',
color='review_score',
hover_data={'percent': True},
labels={'review_score': 'Review Score', 'count': 'Number of Reviews'},
title="Review Score Distribution",
text='percent')
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.update_layout(
height=600)
fig.show()
Here I have bar chart again showing the revoew score percentages, the bigger the bar the bigger the percentage. And the percantages are also wriiten above each bar.
Sellers City Order Analysis (All Orders)¶
query = """
SELECT sellers.seller_city,
SUM(price) AS Sales,
COUNT(*) AS Count
FROM order_items
JOIN sellers ON sellers.seller_id = order_items.seller_id
GROUP BY sellers.seller_city
ORDER BY SUM(price) DESC
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| seller_city | Sales | Count | |
|---|---|---|---|
| 0 | sao paulo | 2702878.14 | 27983 |
| 1 | ibitinga | 624592.94 | 7750 |
| 2 | curitiba | 470759.82 | 3016 |
| 3 | rio de janeiro | 358413.59 | 2442 |
| 4 | guarulhos | 329494.38 | 2362 |
| ... | ... | ... | ... |
| 606 | resende | 19.99 | 1 |
| 607 | floranopolis | 19.98 | 2 |
| 608 | jarinu | 14.63 | 1 |
| 609 | macatuba | 13.00 | 1 |
| 610 | palotina | 9.99 | 1 |
611 rows × 3 columns
Seller City Order Analysis (Delivered Orders Only)¶
query = """
SELECT sellers.seller_city,
SUM(price) AS Sales,
COUNT(*) AS Count
FROM order_items
JOIN sellers ON sellers.seller_id = order_items.seller_id
JOIN orders ON orders.order_id = order_items.order_id
WHERE orders.order_status = 'delivered'
GROUP BY sellers.seller_city
ORDER BY SUM(price) DESC
LIMIT 10
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| seller_city | Sales | Count | |
|---|---|---|---|
| 0 | sao paulo | 2628634.52 | 27357 |
| 1 | ibitinga | 614836.09 | 7621 |
| 2 | curitiba | 458812.03 | 2955 |
| 3 | rio de janeiro | 342341.64 | 2356 |
| 4 | guarulhos | 318217.59 | 2309 |
| 5 | ribeirao preto | 267478.85 | 2208 |
| 6 | itaquaquecetuba | 228460.34 | 1639 |
| 7 | guariba | 226987.93 | 1148 |
| 8 | santo andre | 223280.71 | 2886 |
| 9 | lauro de freitas | 220689.44 | 401 |
- Comparison: All Orders vs. Delivered Orders
Sao Paulo remains the top-performing city in both total sales and transaction count, contributing the most significant share.
Sales figures for "delivered orders" are slightly lower than those for "all orders," indicating some incomplete or canceled orders, particularly in larger markets.
Ibatinga, Curitiba, and Rio de Janeiro consistently follow Sao Paulo, showcasing high transactional volume and revenue.
- Insights and Recommendations:
Top Markets (Sao Paulo, Ibatinga, Curitiba): These cities contribute significantly to revenue and sales volume. Strategies such as targeted marketing campaigns, loyalty programs, or promotional offers in these regions could amplify growth further.
Smaller Markets (Guariba, Lauro de Freitas): Cities like Guariba and Lauro de Freitas have high average order values despite lower transaction counts. Investigating seller or product types in these areas could reveal niche opportunities to expand.
Delivered Orders vs. All Orders: A small proportion of sales occurs from non-delivered orders. Investigate order cancellations or pending orders in top-performing cities (e.g., Sao Paulo or Ibatinga) to address potential inefficiencies.
# Matplotlib Visualization
query_city = """
SELECT sellers.seller_city,
SUM(price) AS Sales,
COUNT(*) AS Count
FROM order_items
JOIN sellers ON sellers.seller_id = order_items.seller_id
JOIN orders ON orders.order_id = order_items.order_id
WHERE orders.order_status = 'delivered'
GROUP BY sellers.seller_city
ORDER BY SUM(price) DESC
LIMIT 10
"""
city_data = pd.read_sql(query_city, conn)
plt.figure(figsize=(10, 6))
plt.barh(city_data['seller_city'], city_data['Sales'], color='darkorange')
plt.xlabel('Total Sales (in BRL)', fontsize=12)
plt.ylabel('Seller City', fontsize=12)
plt.title('Top 10 Seller Cities by Sales (Delivered Orders)', fontsize=16)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
With the help of this chart we can easily see top seller cities by sales. The bigger the bar the bigger the revenue.
# Seaborn Visualization
plt.figure(figsize=(10, 6))
sns.barplot(x='Count', y='seller_city', data=city_data, palette='coolwarm')
plt.xlabel('Number of Delivered Orders', fontsize=12)
plt.ylabel('Seller City', fontsize=12)
plt.title('Top 10 Seller Cities by Number of Delivered Orders', fontsize=16)
plt.tight_layout()
plt.show()
Here we can see the same results, I used seaborn here, wereas in earlieer chart I used matplotlip visualization.
# Plotly Visualization
fig = px.bar(city_data,
x='Sales',
y='seller_city',
orientation='h',
color='Sales',
color_continuous_scale='Viridis',
title='Top 10 Seller Cities by Sales (Delivered Orders)',
labels={'Sales': 'Total Sales (in BRL)', 'seller_city': 'Seller City'})
fig.update_layout(title_font_size=20,
xaxis_title='Total Sales (in BRL)',
yaxis_title='Seller City',
yaxis=dict(categoryorder='total ascending'))
fig.show()
Here again we can see the the visualization showing the top 10 seller cities by revenue again, and this time I used Plotly .
Number of total orders, delivered orders and the percent of delivered orders¶
query = """
SELECT
Count(order_id) AS 'Total orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'delivered') AS 'Delivered Orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'delivered')*100 / Count(order_id) AS 'Delivered %'
FROM orders
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| Total orders | Delivered Orders | Delivered % | |
|---|---|---|---|
| 0 | 99441 | 96478 | 97 |
Number and percentage of canceled orders¶
query = """
SELECT
Count(order_id) AS 'Total orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'canceled') AS 'Canceled Orders',
(SELECT COUNT(*) FROM orders WHERE orders.order_status = 'canceled')*100 / Count(order_id) AS 'Canceled %'
FROM orders
"""
cur.execute(query)
cur.fetchall()
[(99441, 625, 0)]
Order Status Analysis¶
Total orders: 99,441 (This is the total number of orders in the system.)
Delivered orders: 96,478 Delivered percentage: 97% This means that a significant portion (97%) of the orders have been successfully delivered, indicating an efficient order fulfillment process.
Canceled orders: 625 Canceled percentage: 0% The number of canceled orders is minimal, only 0.63% of the total orders, which is a good indicator of a relatively low cancellation rate.
- Summary Insights:
High delivery success rate: 97% of orders are delivered, suggesting that the system is functioning well in terms of processing and shipping orders. Low cancellation rate: With only 625 canceled orders, the cancellation rate is very low (0%), which can be considered a positive sign in terms of operational efficiency and customer satisfaction.
Analysis of Qualified Leads by Origin¶
query = """
SELECT origin,
COUNT(*),
ROUND(CAST(COUNT(*)*100.0/ (SELECT COUNT(*) FROM leads_qualified) AS FLOAT),3) AS Percentage
FROM leads_qualified
GROUP BY origin
ORDER BY COUNT(*) DESC
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| origin | COUNT(*) | Percentage | |
|---|---|---|---|
| 0 | organic_search | 2296 | 28.700 |
| 1 | paid_search | 1586 | 19.825 |
| 2 | social | 1350 | 16.875 |
| 3 | unknown | 1099 | 13.738 |
| 4 | direct_traffic | 499 | 6.238 |
| 5 | 493 | 6.163 | |
| 6 | referral | 284 | 3.550 |
| 7 | other | 150 | 1.875 |
| 8 | display | 118 | 1.475 |
| 9 | other_publicities | 65 | 0.813 |
| 10 | None | 60 | 0.750 |
# The donut chart
query = """
SELECT origin,
COUNT(*),
ROUND(CAST(COUNT(*)*100.0/ (SELECT COUNT(*) FROM leads_qualified) AS FLOAT),3) AS Percentage
FROM leads_qualified
GROUP BY origin
ORDER BY COUNT(*) DESC
"""
origin_data = pd.read_sql(query, conn)
fig = px.pie(origin_data, values='Percentage', names='origin',
title='Qualified Leads Percentage by Origin',
hole=0.4)
fig.update_traces(textinfo='percent+label')
fig.update_layout(title_font_size=20)
fig.show()
With the help of donut chart we can see the qualified leads percentage by origin, each one has its defined color, and the percentages are written in a way that visually are easy to see and understand immediately.
Analysis of Closed Leads by Business Segment¶
query = """
SELECT COUNT(*), business_segment,
ROUND(COUNT(*) *100.0 / (SELECT COUNT(*) FROM leads_closed),2) AS 'Percentage'
FROM leads_closed
GROUP BY business_segment
ORDER BY COUNT(*) DESC
LIMIT 10
"""
cur.execute(query)
cur.fetchall()
pd.read_sql(query, conn)
| COUNT(*) | business_segment | Percentage | |
|---|---|---|---|
| 0 | 105 | home_decor | 12.47 |
| 1 | 93 | health_beauty | 11.05 |
| 2 | 77 | car_accessories | 9.14 |
| 3 | 71 | household_utilities | 8.43 |
| 4 | 69 | construction_tools_house_garden | 8.19 |
| 5 | 64 | audio_video_electronics | 7.60 |
| 6 | 34 | computers | 4.04 |
| 7 | 30 | pet | 3.56 |
| 8 | 28 | food_supplement | 3.33 |
| 9 | 26 | food_drink | 3.09 |
Analysis for two above queries¶
leads qualified by origin¶
1.Top Origins by Count:
Organic Search: 2,296 leads (28.7% of the total leads) Paid Search: 1,586 leads (19.83%) Social: 1,350 leads (16.88%) Unknown: 1,099 leads (13.74%) These origins are significant in driving qualified leads. Organic search, paid search, and social media contribute almost 65% of the total leads.
2.Other Origins:
Direct Traffic: 499 leads (6.24%) Email: 493 leads (6.16%) Referral: 284 leads (3.55%) Display: 118 leads (1.48%) Other Publicities: 65 leads (0.81%) Unknown: 60 leads (0.75%) These categories represent a smaller portion of the leads but are still valuable, especially for understanding how less conventional sources contribute to lead generation.
Leads Closed by Business Segment:¶
1.Top Business Segments by Count:
Home Decor: 105 closed leads (12.47%) Health & Beauty: 93 closed leads (11.05%) Car Accessories: 77 closed leads (9.14%) Household Utilities: 71 closed leads (8.43%) Construction Tools & House/Garden: 69 closed leads (8.19%) These business segments make up a substantial portion of closed leads. Home Decor and Health & Beauty together account for 23.5% of the closed leads.
- Other Business Segments:
Audio/Video Electronics: 64 leads (7.60%) Computers: 34 leads (4.04%) Pet: 30 leads (3.56%) Food Supplement: 28 leads (3.33%) Food & Drink: 26 leads (3.09%) Sports & Leisure: 25 leads (2.97%) Fashion Accessories: 20 leads (2.38%) Several other segments contribute a smaller but still meaningful portion to the business, with notable entries like Food & Drink, Pet, and Sports & Leisure.
- Insights & Recommendations:
Marketing Focus: The top three origins (Organic Search, Paid Search, and Social) are the primary drivers of qualified leads. Continuing to optimize for these channels, with a particular focus on improving organic search rankings and paid search campaigns, can yield even more qualified leads.
Efforts to further investigate and enhance the "Unknown" category could uncover valuable opportunities for better lead classification and targeting.
Sales Segmentation: Home Decor, Health & Beauty, and Car Accessories are the leading business segments in terms of closed leads. These segments should be prioritized for further marketing and sales efforts, as they demonstrate strong conversion potential.
Other segments like Audio/Video Electronics, Computers, and Pet could be explored for different campaigns that might improve conversions or attract more leads.
query_segment = """
SELECT COUNT(*) AS lead_count, business_segment,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM leads_closed), 2) AS percentage
FROM leads_closed
GROUP BY business_segment
ORDER BY lead_count DESC
LIMIT 10;
"""
segment_data = pd.read_sql(query_segment, conn)
#barplot
plt.figure(figsize=(10, 6))
plt.barh(segment_data['business_segment'], segment_data['lead_count'], color='teal')
plt.xlabel('Number of Closed Leads', fontsize=12)
plt.ylabel('Business Segment', fontsize=12)
plt.title('Closed Leads by Business Segment', fontsize=16)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
fig = px.pie(segment_data, values='percentage', names='business_segment',
title='Closed Leads Percentage by Business Segment')
fig.update_traces(textinfo='percent+label')
fig.update_layout(title_font_size=20)
fig.show()
Here in case of both the bar plot and pie plot we can visually see the closed leads percentage bu business segment, the bigger the size of bar/pie part, the bigger the percentage. So home_decor has highest percentage, and food_drink has lowest among 10.